Preprocessing QC statistics

Noam, July 2023

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [2]:
import os

# Hard-coded lab locations: the repository checkout and the shared data root.
# NOTE(review): absolute paths tie this notebook to one machine/user checkout.
MOMAPS_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps_Noam/MOmaps'
MOMAPS_DATA_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps'
# Preprocessing logs read by log_files_qc, and the figures folder handed to the QC helpers.
LOGS_PATH = os.path.join(MOMAPS_HOME, 'outputs','preprocessing','spd','logs', 'microglia')
PLOT_PATH = os.path.join(MOMAPS_HOME, 'src', 'preprocessing', 'notebooks','figures','microglia')
# Work from the repository root; the `src.*` imports below and the relative
# nbconvert path in the last cell presumably rely on this, so keep it before them.
os.chdir(MOMAPS_HOME)

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid", font_scale=1.5)
# NOTE(review): color_palette() only returns a palette and the result is discarded here.
# If the husl palette was meant to become the default, sns.set_palette("husl", 8) is needed.
sns.color_palette("husl", 8)
plt.rcParams["image.cmap"] = "Set1"
from tqdm.notebook import tqdm
from src.common.lib.preprocessing_utils import rescale_intensity
from src.common.lib.images_qc import *
import contextlib
import io
import matplotlib
import warnings

# Silence pandas' SettingWithCopyWarning. Newer pandas releases expose the class under
# pandas.errors and no longer under pandas.core.common, so look it up defensively
# instead of failing with AttributeError; skip the filter if the class does not exist.
_setting_with_copy_warning = getattr(pd.errors, 'SettingWithCopyWarning', None)
if _setting_with_copy_warning is None:
    _setting_with_copy_warning = getattr(pd.core.common, 'SettingWithCopyWarning', None)
if _setting_with_copy_warning is not None:
    warnings.filterwarnings('ignore', category=_setting_with_copy_warning)

# Star import supplies the configuration names used by later cells
# (e.g. panels, markers, reps) -- kept as-is because the exact set of names is not visible here.
from src.common.lib.qc_config_tmp import *
In [3]:
# Read the preprocessing log files under LOGS_PATH into the QC table `df` used by the
# plotting cells below. The helper prints how many files were read and the table shape
# before/after its duplicate handling (see the cell output).
df = log_files_qc(LOGS_PATH)
Total of 3 files were read.
Before dup handeling  (50946, 20)
After duplication removal #1: (50946, 21)
After duplication removal #2: (50946, 21)

Validate folder structure and file existence

In [4]:
# Batches covered by this report: batch2 .. batch4 (the range end is exclusive).
batches = ['batch{}'.format(batch_number) for batch_number in range(2, 5)]
batches
Out[4]:
['batch2', 'batch3', 'batch4']

Raw Files

In [7]:
# Root of the raw SpinningDisk images for the microglia experiment.
root_directory_raw = os.path.join(
    MOMAPS_DATA_HOME,
    'input', 'images', 'raw',
    'SpinningDisk', 'microglia_sort',
)

# Validate the raw folder tree for every selected batch; the helper prints a per-batch
# report (structure validity, bad files, total sites). The second positional argument
# is False here and True in the processed cell -- presumably a "processed" flag.
raws = run_validate_folder_structure(
    root_directory_raw,
    False,
    panels,
    markers,
    PLOT_PATH,
    marker_info,
    microglia_cell_lines_to_cond,
    reps,
    microglia_cell_lines_for_disp,
    expected_dapi_raw,
    batches=batches,
)
batch2
Folder structure is valid.
Some files are bad:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/microglia_sort/batch2/SCNA/panelA/Untreated/rep2/G3BP1, R11_w3confCy5_s1533.tif small size (0.008 kB)
Total Sites:  57331
========
batch3
Folder structure is valid.
Some files are bad:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/microglia_sort/batch3/SCNA/panelC/Untreated/rep1/FMRP, R11_w2confmCherry_s1413.tif small size (0.008 kB)
Total Sites:  56850
========
batch4
Folder structure is valid.
All files exists.
Total Sites:  52800
========
====================

Processed

In [8]:
# Root of the processed (spd2) SpinningDisk images for the microglia experiment.
root_directory_proc = os.path.join(
    MOMAPS_DATA_HOME,
    'input', 'images', 'processed', 'spd2',
    'SpinningDisk', 'microglia',
)

# Same validation as for the raw tree, with the second positional argument set to True.
# NOTE(review): expected_dapi_raw is passed for the processed data as well -- confirm
# this is intended rather than a processed-specific expectation.
procs = run_validate_folder_structure(
    root_directory_proc,
    True,
    panels,
    markers,
    PLOT_PATH,
    marker_info,
    microglia_cell_lines_to_cond,
    reps,
    microglia_cell_lines_for_disp,
    expected_dapi_raw,
    batches=batches,
)
batch2
Folder structure is valid.
All files exists.
Total Sites:  15022
========
batch3
Folder structure is valid.
All files exists.
Total Sites:  11002
========
batch4
Folder structure is invalid. Missing paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/PSD95
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/NEMO
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/GM130
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/NCL
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/ANXA11
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/Calreticulin
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/mitotracker
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/CLTC
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/DCP1A
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/TOMM20
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/FUS
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/SCNA
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/LAMP1
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/TIA1
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/PML
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/microglia/batch4/OPTN/Untreated/PEX14
All files exists.
Total Sites:  10293
========
====================

Difference between Raw and Processed

In [16]:
# Show, batch by batch, the difference between the raw (`raws`) and processed (`procs`)
# validation results; PLOT_PATH is passed to the helper (presumably for saving figures).
display_diff(batches, raws, procs, PLOT_PATH)
batch2
========
batch3
========
batch4
========
In [10]:
# Sampled variance of the processed images, one value per batch.
# Note kept from an earlier run over more batches: batch9 contained files of 1 byte.
for batch in batches:
    # Discard everything the helper prints; only the final variance is reported.
    discarded_output = io.StringIO()
    with contextlib.redirect_stdout(discarded_output):
        var = sample_and_calc_variance(
            root_directory_proc,
            batch,
            sample_size_per_markers=200,
            num_markers=26,
            cond_count=1,
            rep_count=len(reps),
        )
    print(f'{batch} var: ',var)
batch2 var:  0.010164931452749274
batch3 var:  0.009063675934677664
batch4 var:  0.009132239572094056

Number of sites in each batch and cell line

In [20]:
# Plot the number of sites per batch and cell line from the QC table, split by replicate
# (split_to_reps=True). `expected_raw` is passed alongside -- presumably the expected
# site count used as a reference; verify against the helper.
plot_sites_count(df, expected_raw, microglia_lines_order, microglia_custom_palette, split_to_reps=True)

Number of Cells in Site for each batch and cell line

In [21]:
# Keep only sites that have at least one valid tile before counting cells.
has_valid_tiles = df['n_valid_tiles'] != 0
df_no_empty_sites = df.loc[has_valid_tiles]

# Cell counts per site, first with whole_cells=True, then with whole_cells=False.
plot_cell_count(
    df_no_empty_sites,
    microglia_lines_order,
    microglia_custom_palette,
    whole_cells=True,
)

plot_cell_count(
    df_no_empty_sites,
    microglia_lines_order,
    microglia_custom_palette,
    whole_cells=False,
)
# Optional: pass norm=True to normalise by the maximum.

Number of valid tiles per image (site)

In [19]:
# Plot the number of valid tiles per site. batch_min/batch_max are hard-coded to 2..4,
# matching the `batches` selection above -- keep them in sync if the batch list changes.
plot_n_valid_tiles_count(df, microglia_custom_palette,reps, batch_min=2, batch_max=4)

Heatmap QC per batch, panel and cell line (tiles that passed the QC condition)

In [22]:
# QC heatmap from the QC table: cell lines as rows, panels as columns, split by replicate.
plot_hm(df, split_by='rep', rows='cell_line', columns='panel')

Assessing Staining Reproducibility and Outliers

In [ ]:
# Run the histogram-based staining reproducibility check once per batch,
# printing the batch name before and a separator line after each run.
for batch in batches:
    print(batch)
    run_calc_hist_new(
        f'microglia_sort/{batch}',
        microglia_cell_lines_for_disp,
        markers,
        hist_sample=10,
        cond_count=1,
        rep_count=len(reps),
        sample_size_per_markers=200,
    )
    print("=" * 30)
batch2
==============================
batch3
==============================
batch4
In [ ]:
# Save the notebook, then export it to HTML (the HTML is written to the same folder
# as the notebook file).
import subprocess
from IPython.display import display, Javascript

# Ask the notebook front end to checkpoint first so the export reflects the latest state.
# NOTE(review): `IPython.notebook` looks like the classic-Notebook JavaScript API and the
# call runs asynchronously in the browser, so the save may not have completed (or may not
# run at all in other front ends) when nbconvert starts -- confirm the HTML is up to date.
display(Javascript('IPython.notebook.save_checkpoint();'))

# The notebook path is relative to the current working directory (the config cell
# chdir's to MOMAPS_HOME). An argument list avoids shell parsing, and check=True raises
# CalledProcessError when nbconvert fails instead of silently returning a status code.
completed = subprocess.run(
    [
        'jupyter', 'nbconvert', '--to', 'html',
        'src/preprocessing/notebooks/cell_count_stats_analysis_microglia.ipynb',
    ],
    check=True,
)
# Display the exit status, as the original os.system call did.
completed.returncode
In [ ]: